import pandas as pd
import numpy as np
import os
# import xlrd
from openpyxl import load_workbook
import statistics
import datetime as dt
import re
import sweetviz as sv
from math import log2
from matplotlib import pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import minmax_scale
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import models, layers
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn import svm
from sklearn.metrics import confusion_matrix, classification_report
import xgboost
# Pie chart of how the rows of df split across the EnrolledStatus categories.
# Count the categories once and reuse the result for both the wedge sizes
# and the wedge labels instead of running value_counts() twice.
status_counts = df['EnrolledStatus'].value_counts()
data = status_counts.values
labels = list(status_counts.index)
# Take one pastel colour per category rather than a hard-coded two, so every
# wedge still gets its own colour if another status value shows up.
colors = list(sns.color_palette('pastel', n_colors=len(labels)))
# autopct prints each wedge's share as a whole-number percentage.
plt.pie(data, labels=labels, colors=colors, autopct='%.0f%%')
plt.show()
68% of the students enrolled, whereas 32% did not enrol.
# Overlay the Age distribution of the 'Not Enrolled' and 'Enrolled' groups.
dfTemp = df.copy()
# Rescale Age to the [0, 1] range on the copy, leaving df itself untouched.
dfTemp['Age'] = minmax_scale(dfTemp['Age'].values, feature_range=(0, 1), axis=0, copy=True)
sns.set_theme()
plt.figure(figsize=(20, 10))
plt.rcParams['font.size'] = '25'
x1 = dfTemp.loc[dfTemp['EnrolledStatus'] == 'Not Enrolled', 'Age'].values
x2 = dfTemp.loc[dfTemp['EnrolledStatus'] == 'Enrolled', 'Age'].values
# sns.distplot is deprecated and slated for removal; histplot with a KDE
# overlay on a density scale is the axes-level replacement. The colours are
# pinned to the first two colour-cycle entries so the two groups stay
# distinguishable regardless of how the installed seaborn picks defaults.
ax = sns.histplot(x1, kde=True, stat='density', color='C0')
ax = sns.histplot(x2, kde=True, stat='density', color='C1')
ax.set_xlabel('Age', fontsize=20)
ax.set_ylabel('Density', fontsize=20)
# Labels are given in plotting order: 'Not Enrolled' was drawn first.
ax.legend(["Not Enrol", "Enrol"], fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=20)
ax.tick_params(axis='both', which='minor', labelsize=20)
# Zoom in on the 0.5-0.6 band of the scaled Age axis.
ax.set_xlim([0.5, 0.6])
c:\users\jiunshyangoh\documents\testenv\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) c:\users\jiunshyangoh\documents\testenv\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
(0.5, 0.6)
The age distributions are quite similar for students who enrol and those who do not, so age may not be a good predictor.
# Bar chart of the "Score(IG)" column of resultDF, one bar per "Factor",
# with every bar annotated by its own score.
fig = (
    px.bar(
        resultDF,
        x="Factor",
        y="Score(IG)",
        text="Score(IG)",
        title="Score(IG) of each factor",
    )
    # Vertical tick labels and a fixed 1000x700 canvas.
    .update_layout(xaxis_tickangle=90, width=1000, height=700)
    # Place the score text above each bar.
    .update_traces(textposition="outside", textfont_size=20)
)
fig.show()
The table is sorted by the IG score, which we use as the measure of factor importance. We arbitrarily start with the top 5 factors. Subsequently, we try the top 6, 7 and 8 factors, as long as each added factor's score is positive.
# Display the results table for the 5-feature models as the notebook cell's
# output (a bare expression: it only has an effect in an interactive session).
dfResult_5Features
| | Threshold | False Positive Rate | Recall | Conversion Rate | Model | Type |
|---|---|---|---|---|---|---|
| 0 | 0.1 | 0.98 | 0.99 | 0.68 | Random Forest | 5Features |
| 1 | 0.2 | 0.96 | 0.98 | 0.68 | Random Forest | 5Features |
| 2 | 0.3 | 0.91 | 0.96 | 0.69 | Random Forest | 5Features |
| 3 | 0.4 | 0.87 | 0.93 | 0.69 | Random Forest | 5Features |
| 4 | 0.5 | 0.79 | 0.87 | 0.70 | Random Forest | 5Features |
| 5 | 0.6 | 0.63 | 0.76 | 0.71 | Random Forest | 5Features |
| 6 | 0.7 | 0.43 | 0.55 | 0.72 | Random Forest | 5Features |
| 7 | 0.8 | 0.23 | 0.30 | 0.73 | Random Forest | 5Features |
| 8 | 0.9 | 0.08 | 0.11 | 0.75 | Random Forest | 5Features |
| 9 | 0.1 | 1.00 | 1.00 | 0.68 | XGB | 5Features |
| 10 | 0.2 | 0.99 | 1.00 | 0.68 | XGB | 5Features |
| 11 | 0.3 | 0.97 | 0.99 | 0.68 | XGB | 5Features |
| 12 | 0.4 | 0.92 | 0.98 | 0.69 | XGB | 5Features |
| 13 | 0.5 | 0.87 | 0.95 | 0.69 | XGB | 5Features |
| 14 | 0.6 | 0.72 | 0.85 | 0.71 | XGB | 5Features |
| 15 | 0.7 | 0.35 | 0.49 | 0.75 | XGB | 5Features |
| 16 | 0.8 | 0.08 | 0.13 | 0.78 | XGB | 5Features |
| 17 | 0.9 | 0.00 | 0.00 | 0.85 | XGB | 5Features |
| 18 | 0.1 | 0.99 | 1.00 | 0.68 | 2LayersDL | 5Features |
| 19 | 0.2 | 0.96 | 0.99 | 0.68 | 2LayersDL | 5Features |
| 20 | 0.3 | 0.92 | 0.97 | 0.69 | 2LayersDL | 5Features |
| 21 | 0.4 | 0.87 | 0.94 | 0.69 | 2LayersDL | 5Features |
| 22 | 0.5 | 0.80 | 0.89 | 0.70 | 2LayersDL | 5Features |
| 23 | 0.6 | 0.66 | 0.79 | 0.72 | 2LayersDL | 5Features |
| 24 | 0.7 | 0.48 | 0.61 | 0.73 | 2LayersDL | 5Features |
| 25 | 0.8 | 0.23 | 0.31 | 0.73 | 2LayersDL | 5Features |
| 26 | 0.9 | 0.05 | 0.07 | 0.74 | 2LayersDL | 5Features |
# Recall versus threshold for the 5-feature results, one line per Model.
yCol = "Recall"
# Work on a fresh frame with a clean 0..n-1 index.
dfTemp = dfResult_5Features.reset_index(drop=True)
sns.set(font_scale=2)
plt.figure(figsize=(20, 10))
sns.lineplot(
    # Rows are ordered by descending yCol before plotting.
    data=dfTemp.sort_values(by=[yCol], ascending=False),
    x="Threshold",
    y=yCol,
    hue="Model",
    palette=["blue", "red", "green"],
    alpha=0.5,
)
<AxesSubplot:xlabel='Threshold', ylabel='Recall'>
In terms of recall, XGB performs better than DL and RF for thresholds below 0.7.